bitkeeper revision 1.1044 (40ec19236iHRt47R5UsW46zQ42dHIw)
authorgm281@boulderdash.cl.cam.ac.uk <gm281@boulderdash.cl.cam.ac.uk>
Wed, 7 Jul 2004 15:39:15 +0000 (15:39 +0000)
committergm281@boulderdash.cl.cam.ac.uk <gm281@boulderdash.cl.cam.ac.uk>
Wed, 7 Jul 2004 15:39:15 +0000 (15:39 +0000)
Implementation of a new scheduler. It is based on BVT (Borrowed Virtual Time) but tries to give a fairer allocation of CPU in diverse environments (CPU-bound domains running against I/O-bound ones). For that reason I called it Fair BVT (or FBVT for short). The BVT implementation provided the basic implementation. This changeset also contains the interface to control the scheduler. Unfortunately the context switch allowance cannot be changed yet (a bug). The parameters introduced in the scheduler are likely to change in the near future (after running tests).

17 files changed:
.rootkeys
tools/libxc/Makefile
tools/libxc/xc.h
tools/libxc/xc_fbvtsched.c [new file with mode: 0644]
tools/python/xen/lowlevel/xc/xc.c
tools/python/xen/xend/XendClient.py
tools/python/xen/xend/XendDomain.py
tools/python/xen/xend/XendNode.py
tools/python/xen/xend/server/SrvDomain.py
tools/python/xen/xend/server/SrvNode.py
tools/python/xen/xm/main.py
xen/common/sched_bvt.c
xen/common/sched_fair_bvt.c [new file with mode: 0644]
xen/common/schedule.c
xen/include/hypervisor-ifs/sched_ctl.h
xen/include/xen/sched-if.h
xen/include/xen/sched.h

index feb8ec1995065d41089b6f71b9ada76491a03ac9..8a288f2c982dbf6b6b9229c96ec8f88e0d364848 100644 (file)
--- a/.rootkeys
+++ b/.rootkeys
 3fbba6dbasJQV-MVElDC0DGSHMiL5w tools/libxc/xc_domain.c
 40278d99BLsfUv3qxv0I8C1sClZ0ow tools/libxc/xc_elf.h
 403e0977Bjsm_e82pwvl9VvaJxh8Gg tools/libxc/xc_evtchn.c
+40ec1922Nq_Rur5KUH0MvRNKczPGxg tools/libxc/xc_fbvtsched.c
 40e03333Eegw8czSWvHsbKxrRZJjRA tools/libxc/xc_io.c
 40e03333vrWGbLAhyJjXlqCHaJt7eA tools/libxc/xc_io.h
 3fbba6dbNCU7U6nsMYiXzKkp3ztaJg tools/libxc/xc_linux_build.c
 3ddb79bdHqdQpATqC0rmUZNbsb6L6A xen/common/resource.c
 4064773cJ31vZt-zhbSoxqft1Jaw0w xen/common/sched_atropos.c
 40589968dD2D1aejwSOvrROg7fOvGQ xen/common/sched_bvt.c
+40ec1922He_dRhVJdOicTcHvT8v1NQ xen/common/sched_fair_bvt.c
 40589968be_t_n0-w6ggceW7h-sx0w xen/common/sched_rrobin.c
 3e397e6619PgAfBbw2XFbXkewvUWgw xen/common/schedule.c
 405b8599xI_PoEr3zZoJ2on-jdn7iw xen/common/shadow.c
index bf81b86c6d297f236a30cf0996c17e38204676f2..e0e7046cd97f59b8376fdf7a364449c00635e70b 100644 (file)
@@ -23,6 +23,7 @@ INCLUDES += -I $(XEN_LIBXUTIL)
 SRCS     :=
 SRCS     += xc_atropos.c
 SRCS     += xc_bvtsched.c
+SRCS     += xc_fbvtsched.c
 SRCS     += xc_domain.c
 SRCS     += xc_evtchn.c
 SRCS     += xc_io.c
index f9692607bd3194254c43186c51b07b67e01bc83d..d95a7353722b3d5b5a871902d863a3f2715704bb 100644 (file)
@@ -113,6 +113,26 @@ int xc_bvtsched_domain_get(int xc_handle,
                            unsigned long *warpl,
                            unsigned long *warpu);
 
+int xc_fbvtsched_global_set(int xc_handle,
+                           unsigned long ctx_allow);
+
+int xc_fbvtsched_domain_set(int xc_handle,
+                           u32 domid,
+                           unsigned long mcuadv,
+                           unsigned long warp,
+                           unsigned long warpl,
+                           unsigned long warpu);
+
+int xc_fbvtsched_global_get(int xc_handle,
+                           unsigned long *ctx_allow);
+
+int xc_fbvtsched_domain_get(int xc_handle,
+                           u32 domid,
+                           unsigned long *mcuadv,
+                           unsigned long *warp,
+                           unsigned long *warpl,
+                           unsigned long *warpu);
+
 int xc_atropos_domain_set(int xc_handle,
                           u32 domid,
                           u64 period, u64 slice, u64 latency,
diff --git a/tools/libxc/xc_fbvtsched.c b/tools/libxc/xc_fbvtsched.c
new file mode 100644 (file)
index 0000000..55adafe
--- /dev/null
@@ -0,0 +1,89 @@
+/******************************************************************************
+ * xc_fbvtsched.c
+ * 
+ * API for manipulating parameters of the Fair Borrowed Virtual Time scheduler.
+ * 
+ * Copyright (c) 2004, G. Milos
+ * Based on K. Fraser's xc_bvtsched.c
+ */
+
+#include "xc_private.h"
+
+/*
+ * Push a new global FBVT context switch allowance to the hypervisor
+ * through a DOM0_SCHEDCTL request. The return value is whatever
+ * do_dom0_op() reports.
+ */
+int xc_fbvtsched_global_set(int xc_handle,
+                           unsigned long ctx_allow)
+{
+    dom0_op_t request;
+    int rc;
+
+    /* Payload first, then the fields that route the request. */
+    request.u.schedctl.u.fbvt.ctx_allow = ctx_allow;
+    request.u.schedctl.direction = SCHED_INFO_PUT;
+    request.u.schedctl.sched_id = SCHED_FBVT;
+    request.cmd = DOM0_SCHEDCTL;
+
+    rc = do_dom0_op(xc_handle, &request);
+    return rc;
+}
+
+/*
+ * Query the global FBVT context switch allowance with a DOM0_SCHEDCTL
+ * request in the SCHED_INFO_GET direction.
+ *
+ * Returns the result of do_dom0_op(). *ctx_allow is written only when
+ * that result is 0: 'op' lives on the stack and is not initialised
+ * beyond the request fields, so on failure its payload is garbage and
+ * must not be handed back to the caller.
+ */
+int xc_fbvtsched_global_get(int xc_handle,
+                           unsigned long *ctx_allow)
+{
+    dom0_op_t op;
+    int ret;
+    
+    op.cmd = DOM0_SCHEDCTL;
+    op.u.schedctl.sched_id = SCHED_FBVT;
+    op.u.schedctl.direction = SCHED_INFO_GET;
+
+    ret = do_dom0_op(xc_handle, &op);
+
+    if ( ret == 0 )
+        *ctx_allow = op.u.schedctl.u.fbvt.ctx_allow;
+
+    return ret;
+}
+
+/*
+ * Send per-domain FBVT parameters (mcuadv, warp, warpl, warpu) for
+ * domain 'domid' in a DOM0_ADJUSTDOM request. Returns the result of
+ * do_dom0_op().
+ */
+int xc_fbvtsched_domain_set(int xc_handle,
+                           u32 domid,
+                           unsigned long mcuadv,
+                           unsigned long warp,
+                           unsigned long warpl,
+                           unsigned long warpu)
+{
+    dom0_op_t request;
+
+    /* Request header. */
+    request.cmd = DOM0_ADJUSTDOM;
+    request.u.adjustdom.domain    = (domid_t)domid;
+    request.u.adjustdom.sched_id  = SCHED_FBVT;
+    request.u.adjustdom.direction = SCHED_INFO_PUT;
+
+    /* FBVT payload, written in place rather than through an alias. */
+    request.u.adjustdom.u.fbvt.mcu_adv = mcuadv;
+    request.u.adjustdom.u.fbvt.warp    = warp;
+    request.u.adjustdom.u.fbvt.warpl   = warpl;
+    request.u.adjustdom.u.fbvt.warpu   = warpu;
+
+    return do_dom0_op(xc_handle, &request);
+}
+
+
+/*
+ * Query the per-domain FBVT parameters of domain 'domid' with a
+ * DOM0_ADJUSTDOM request in the SCHED_INFO_GET direction.
+ *
+ * Returns the result of do_dom0_op(). The four output parameters are
+ * written only when that result is 0: 'op' is an uninitialised stack
+ * object apart from the request fields, so after a failed call its
+ * payload holds no meaningful values.
+ */
+int xc_fbvtsched_domain_get(int xc_handle,
+                           u32 domid,
+                           unsigned long *mcuadv,
+                           unsigned long *warp,
+                           unsigned long *warpl,
+                           unsigned long *warpu)
+{
+    
+    dom0_op_t op;
+    int ret;
+    struct fbvt_adjdom *adjptr = &op.u.adjustdom.u.fbvt;
+
+    op.cmd = DOM0_ADJUSTDOM;
+    op.u.adjustdom.domain  = (domid_t)domid;
+    op.u.adjustdom.sched_id = SCHED_FBVT;
+    op.u.adjustdom.direction = SCHED_INFO_GET;
+
+    ret = do_dom0_op(xc_handle, &op);
+    if ( ret != 0 )
+        return ret;
+
+    *mcuadv = adjptr->mcu_adv;
+    *warp   = adjptr->warp;
+    *warpl  = adjptr->warpl;
+    *warpu  = adjptr->warpu;
+    return ret;
+}
index 9cce7b060d7d472d86314354c0c467ab9f258d86..eb47e55dfd554e1ebea278c2c31d31c09c0e8f9b 100644 (file)
@@ -479,6 +479,92 @@ static PyObject *pyxc_bvtsched_domain_get(PyObject *self,
                          "warpu",  warpu);
 }
 
+/*
+ * Python binding: fbvtsched_global_set(ctx_allow).
+ * Returns the shared 'zero' object on success; on failure raises
+ * xc_error from errno (or returns NULL if argument parsing failed).
+ */
+static PyObject *pyxc_fbvtsched_global_set(PyObject *self,
+                                          PyObject *args,
+                                          PyObject *kwds)
+{
+    static char *kwd_list[] = { "ctx_allow", NULL };
+    XcObject *xc_obj = (XcObject *)self;
+    unsigned long ctx_allow;
+    int rc;
+
+    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "l", kwd_list, &ctx_allow) )
+        return NULL;
+
+    rc = xc_fbvtsched_global_set(xc_obj->xc_handle, ctx_allow);
+    if ( rc != 0 )
+        return PyErr_SetFromErrno(xc_error);
+    
+    Py_INCREF(zero);
+    return zero;
+}
+
+/*
+ * Python binding: fbvtsched_global_get().
+ * Takes no arguments. On success returns a dict {"ctx_allow": <long>};
+ * on failure raises xc_error from errno.
+ */
+static PyObject *pyxc_fbvtsched_global_get(PyObject *self,
+                                          PyObject *args,
+                                          PyObject *kwds)
+{
+    XcObject *xc = (XcObject *)self;
+    
+    unsigned long ctx_allow;
+    
+    if ( !PyArg_ParseTuple(args, "") )
+        return NULL;
+    
+    if ( xc_fbvtsched_global_get(xc->xc_handle, &ctx_allow) != 0 )
+        return PyErr_SetFromErrno(xc_error);
+    
+    /*
+     * The braces are required to build a dict. Without them the format
+     * "s:l" yields the 2-tuple ("ctx_allow", value), which contradicts
+     * the "Returns: [dict]" contract in the method table.
+     */
+    return Py_BuildValue("{s:l}", "ctx_allow", ctx_allow);
+}
+
+/*
+ * Python binding: fbvtsched_domain_set(dom, mcuadv, warp, warpl, warpu).
+ * Returns the shared 'zero' object on success; on failure raises
+ * xc_error from errno (or returns NULL if argument parsing failed).
+ */
+static PyObject *pyxc_fbvtsched_domain_set(PyObject *self,
+                                          PyObject *args,
+                                          PyObject *kwds)
+{
+    static char *kwd_list[] = { "dom", "mcuadv", "warp", "warpl",
+                                "warpu", NULL };
+    XcObject *xc_obj = (XcObject *)self;
+    unsigned long mcuadv, warp, warpl, warpu;
+    u32           dom;
+    int           rc;
+
+    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "illll", kwd_list,
+                                      &dom, &mcuadv, &warp, &warpl, &warpu) )
+        return NULL;
+
+    rc = xc_fbvtsched_domain_set(xc_obj->xc_handle, dom, mcuadv,
+                                 warp, warpl, warpu);
+    if ( rc != 0 )
+        return PyErr_SetFromErrno(xc_error);
+    
+    Py_INCREF(zero);
+    return zero;
+}
+
+/*
+ * Python binding: fbvtsched_domain_get(dom).
+ * On success returns a dict with keys "domain", "mcuadv", "warp",
+ * "warpl" and "warpu"; on failure raises xc_error from errno (or
+ * returns NULL if argument parsing failed).
+ */
+static PyObject *pyxc_fbvtsched_domain_get(PyObject *self,
+                                          PyObject *args,
+                                          PyObject *kwds)
+{
+    static char *kwd_list[] = { "dom", NULL };
+    XcObject *xc_obj = (XcObject *)self;
+    unsigned long mcuadv, warp, warpl, warpu;
+    u32 dom;
+    int rc;
+
+    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "i", kwd_list, &dom) )
+        return NULL;
+    
+    rc = xc_fbvtsched_domain_get(xc_obj->xc_handle, dom, &mcuadv, &warp,
+                                 &warpl, &warpu);
+    if ( rc != 0 )
+        return PyErr_SetFromErrno(xc_error);
+
+    return Py_BuildValue("{s:i,s:l,s:l,s:l,s:l}",
+                         "domain", dom,
+                         "mcuadv", mcuadv,
+                         "warp",   warp,
+                         "warpl",  warpl,
+                         "warpu",  warpu);
+}
+
 static PyObject *pyxc_evtchn_bind_interdomain(PyObject *self,
                                               PyObject *args,
                                               PyObject *kwds)
@@ -973,6 +1059,44 @@ static PyMethodDef pyxc_methods[] = {
       " warpl  [long]: Warp limit,\n"
     },
 
+    { "fbvtsched_global_set",
+      (PyCFunction)pyxc_fbvtsched_global_set,
+      METH_VARARGS | METH_KEYWORDS, "\n"
+      "Set global tuning parameters for Fair Borrowed Virtual Time scheduler.\n"
+      " ctx_allow [int]: Minimal guaranteed quantum.\n\n"
+      "Returns: [int] 0 on success; -1 on error.\n" },
+
+    { "fbvtsched_global_get",
+      (PyCFunction)pyxc_fbvtsched_global_get,
+      METH_KEYWORDS, "\n"
+      "Get global tuning parameters for FBVT scheduler.\n"
+      "Returns: [dict]:\n"
+      " ctx_allow [int]: context switch allowance\n" },
+
+    { "fbvtsched_domain_set",
+      (PyCFunction)pyxc_fbvtsched_domain_set,
+      METH_VARARGS | METH_KEYWORDS, "\n"
+      "Set per-domain tuning parameters for Fair Borrowed Virtual Time scheduler.\n"
+      " dom    [int]: Identifier of domain to be tuned.\n"
+      " mcuadv [int]: Proportional to the inverse of the domain's weight.\n"
+      " warp   [int]: How far to warp domain's EVT on unblock.\n"
+      " warpl  [int]: How long the domain can run warped.\n"
+      " warpu  [int]: How long before the domain can warp again.\n\n"
+      "Returns: [int] 0 on success; -1 on error.\n" },
+
+    { "fbvtsched_domain_get",
+      (PyCFunction)pyxc_fbvtsched_domain_get,
+      METH_KEYWORDS, "\n"
+      "Get per-domain tuning parameters under the FBVT scheduler.\n"
+      " dom [int]: Identifier of domain to be queried.\n"
+      "Returns [dict]:\n"
+      " domain [int]:  Domain ID.\n"
+      " mcuadv [long]: MCU Advance.\n"
+      " warp   [long]: Warp.\n"
+      " warpu  [long]: Unwarp requirement.\n"
+      " warpl  [long]: Warp limit,\n"
+    },
+
     { "atropos_domain_set",
       (PyCFunction)pyxc_atropos_domain_set,
       METH_KEYWORDS, "\n"
index 13dc3dbb1eeac682b1931a51ef9379359782b1ea..e8d62fbc555d6c727589b739c7d5af6c0def14f6 100644 (file)
@@ -171,10 +171,15 @@ class Xend:
                          {'op'      : 'cpu_rrobin_slice_set',
                           'slice'   : slice })
     
-    def xend_node_cpu_bvt_slice_set(self, slice):
+    def xend_node_cpu_bvt_slice_set(self, ctx_allow):
         return xend_call(self.nodeurl(),
                          {'op'      : 'cpu_bvt_slice_set',
-                          'slice'   : slice })
+                          'ctx_allow'   : ctx_allow })
+    
+    def xend_node_cpu_fbvt_slice_set(self, ctx_allow):
+        """Send a 'cpu_fbvt_slice_set' op carrying ctx_allow to the node URL."""
+        url = self.nodeurl()
+        request = {'op': 'cpu_fbvt_slice_set',
+                   'ctx_allow': ctx_allow}
+        return xend_call(url, request)
 
     def xend_domains(self):
         return xend_get(self.domainurl())
@@ -226,10 +231,19 @@ class Xend:
     def xend_domain_cpu_bvt_set(self, id, mcuadv, warp, warpl, warpu):
         return xend_call(self.domainurl(id),
                          {'op'      : 'cpu_bvt_set',
-                          'mcuadv'  : mvuadv,
+                          'mcuadv'  : mcuadv,
                           'warp'    : warp,
                           'warpl'   : warpl,
                           'warpu'   : warpu })
+    
+    def xend_domain_cpu_fbvt_set(self, id, mcuadv, warp, warpl, warpu):
+        """Send a 'cpu_fbvt_set' op with the FBVT parameters to domain id's URL."""
+        url = self.domainurl(id)
+        request = {'op': 'cpu_fbvt_set',
+                   'mcuadv': mcuadv,
+                   'warp': warp,
+                   'warpl': warpl,
+                   'warpu': warpu}
+        return xend_call(url, request)
+
 
     def xend_domain_cpu_atropos_set(self, id, period, slice, latency, xtratime):
         return xend_call(self.domainurl(id),
index 328450a6b5753693389b0165f1750ceb53c8deaf..d75405861a481a382d97c43b5a9ce0aebac23546 100644 (file)
@@ -363,6 +363,19 @@ class XendDomain:
         dom = int(dom)
         return xc.bvtsched_domain_get(dom)
     
+    def domain_cpu_fbvt_set(self, dom, mcuadv, warp, warpl, warpu):
+        """Set FBVT (Fair Borrowed Virtual Time) scheduler parameters for a domain.
+
+        dom is coerced with int(); the other values are handed unchanged to
+        xc.fbvtsched_domain_set, whose result is returned.
+        """
+        params = {'dom': int(dom),
+                  'mcuadv': mcuadv,
+                  'warp': warp,
+                  'warpl': warpl,
+                  'warpu': warpu}
+        return xc.fbvtsched_domain_set(**params)
+
+    def domain_cpu_fbvt_get(self, dom):
+        """Get FBVT (Fair Borrowed Virtual Time) scheduler parameters for a domain.
+
+        dom is coerced with int() before being passed to
+        xc.fbvtsched_domain_get, whose result is returned.
+        """
+        return xc.fbvtsched_domain_get(int(dom))
+        
     def domain_cpu_atropos_set(self, dom, period, slice, latency, xtratime):
         """Set Atropos scheduler parameters for a domain.
         """
index 27ea8ef187236643b9d1efeb34a3ee46dd416699..4073d753c7495528175c5e17f90974bb5b5d4dfd 100644 (file)
@@ -24,12 +24,22 @@ class XendNode:
     def notify(self, uri):
         return 0
     
-    def cpu_bvt_slice_set(self, slice):
+    def cpu_bvt_slice_set(self, ctx_allow):
         ret = 0
         #ret = self.xc.bvtsched_global_set(ctx_allow=slice)
         return ret
 
-    def cpu_bvt_slice_get(self, slice):
+    def cpu_bvt_slice_get(self, ctx_allow):
+        ret = 0
+        #ret = self.xc.bvtsched_global_get()
+        return ret
+    
+    def cpu_fbvt_slice_set(self, ctx_allow):
+        ret = 0
+        # Stub: hypervisor call disabled, always returns 0. NOTE(review): presumably should be self.xc.fbvtsched_global_set(ctx_allow=ctx_allow); the old comment named bvtsched and an undefined 'slice' -- confirm.
+        return ret
+
+    def cpu_fbvt_slice_get(self, ctx_allow):
         ret = 0
         #ret = self.xc.bvtsched_global_get()
         return ret
index 156198bd70d0700417d7c93bf6f2d4ab621cff52..122f2f0cc42d0898ea665a837f16925390cd4577 100644 (file)
@@ -70,6 +70,16 @@ class SrvDomain(SrvDir):
                      ['warpu', 'int']])
         val = fn(req.args, {'dom': self.dom.id})
         return val
+    
+    def op_cpu_fbvt_set(self, op, req):
+        """Handle the 'cpu_fbvt_set' op: decode the form arguments as ints and
+        call self.xd.domain_cpu_fbvt_set, forcing 'dom' to this domain's id."""
+        handler = FormFn(self.xd.domain_cpu_fbvt_set,
+                         [['dom', 'int'],
+                          ['mcuadv', 'int'],
+                          ['warp', 'int'],
+                          ['warpl', 'int'],
+                          ['warpu', 'int']])
+        overrides = {'dom': self.dom.id}
+        return handler(req.args, overrides)
 
     def op_cpu_atropos_set(self, op, req):
         fn = FormFn(self.xd.domain_cpu_atropos_set,
index 69747d80c133723952e817ed790d094a167b81cf..0e8807d2b5c920ca0a57d7f355179a12f17850e3 100644 (file)
@@ -4,6 +4,7 @@ import os
 from SrvDir import SrvDir
 from xen.xend import sxp
 from xen.xend import XendNode
+from xen.xend.Args import FormFn
 
 class SrvNode(SrvDir):
     """Information about the node.
@@ -29,7 +30,13 @@ class SrvNode(SrvDir):
 
     def op_cpu_bvt_slice_set(self, op, req):
         fn = FormFn(self.xn.cpu_bvt_slice_set,
-                    [['slice', 'int']])
+                    [['ctx_allow', 'int']])
+        val = fn(req.args, {})
+        return val
+    
+    def op_cpu_fbvt_slice_set(self, op, req):
+        fn = FormFn(self.xn.cpu_fbvt_slice_set,
+                    [['ctx_allow', 'int']])
         val = fn(req.args, {})
         return val
 
index 3ab5d23cf756f6ea326a1f775c4105b307898d75..4e8f5b68a30b5bbb20b268f15fbd5ce6462f16b9 100644 (file)
@@ -347,19 +347,53 @@ xm.prog(ProgBvt)
 
 class ProgBvtslice(Prog):
     group = 'scheduler'
-    name = "bvtslice"
-    info = """Set the BVT scheduler slice."""
+    name = "bvt_ctxallow"
+    info = """Set the BVT scheduler context switch allowance."""
 
     def help(self, args):
-        print args[0], 'SLICE'
-        print '\nSet Borrowed Virtual Time scheduler slice.'
+        print args[0], 'CTX_ALLOW'
+        print '\nSet Borrowed Virtual Time scheduler context switch allowance.'
 
     def main(self, args):
-        if len(args) < 2: self.err('%s: Missing slice' % args[0])
+        if len(args) < 2: self.err('%s: Missing context switch allowance'
+                                                            % args[0])
         server.xend_node_cpu_bvt_slice_set(slice)
 
 xm.prog(ProgBvtslice)
 
+class ProgFbvt(Prog):
+    """xm sub-command 'fbvt': set per-domain FBVT scheduler parameters."""
+    group = 'scheduler'
+    name = "fbvt"
+    info = """Set FBVT scheduler parameters."""
+    
+    def help(self, args):
+        print args[0], "DOM MCUADV WARP WARPL WARPU"
+        print '\nSet Fair Borrowed Virtual Time scheduler parameters.'
+
+    def main(self, args):
+        # Exactly five values are expected after the command name.
+        if len(args) != 6:
+            self.err("%s: Invalid argument(s)" % args[0])
+        values = [int(arg) for arg in args[1:6]]
+        server.xend_domain_cpu_fbvt_set(*values)
+
+xm.prog(ProgFbvt)
+
+class ProgFbvtslice(Prog):
+    """xm sub-command 'fbvt_ctxallow': set the FBVT context switch allowance."""
+    group = 'scheduler'
+    name = "fbvt_ctxallow"
+    info = """Set the FBVT scheduler context switch allowance."""
+
+    def help(self, args):
+        print args[0], 'CTX_ALLOW'
+        print '\nSet Fair Borrowed Virtual Time scheduler context switch allowance.'
+
+    def main(self, args):
+        if len(args) < 2: self.err('%s: Missing context switch allowance.' 
+                                                                % args[0])
+        # Pass the value given on the command line. The name 'slice' was
+        # never assigned here, so it resolved to the Python builtin type
+        # rather than to the user's argument.
+        ctx_allow = int(args[1])
+        server.xend_node_cpu_fbvt_slice_set(ctx_allow)
+
+xm.prog(ProgFbvtslice)
+
+
 class ProgAtropos(Prog):
     group = 'scheduler'
     name= "atropos"
index 6ea9503c70ae9bf01a6e99a208eebe954859290f..b67aa72d927ba0da925b15b68c54b44137a8abd4 100644 (file)
@@ -380,7 +380,6 @@ static task_slice_t bvt_do_schedule(s_time_t now)
     next->min_slice = ctx_allow;
     ret.task = next;
     ret.time = r_time;
-
     return ret;
 }
 
diff --git a/xen/common/sched_fair_bvt.c b/xen/common/sched_fair_bvt.c
new file mode 100644 (file)
index 0000000..f20b754
--- /dev/null
@@ -0,0 +1,538 @@
+/* -*-  Mode:C; c-basic-offset:4; tab-width:4 -*-
+ ****************************************************************************
+ * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
+ * (C) 2002-2003 University of Cambridge
+ * (C) 2004      - Mark Williamson - Intel Research Cambridge
+ ****************************************************************************
+ *
+ *        File: common/sched_fair_bvt.c
+ *      Author: Rolf Neugebauer & Keir Fraser
+ *              Updated for generic API by Mark Williamson
+ *
+ * Description: CPU scheduling
+ *              implements a Fair Borrowed Virtual Time (FBVT) scheduler, based on BVT.
+ *              (see Duda & Cheriton SOSP'99)
+ */
+
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/lib.h>
+#include <xen/sched.h>
+#include <xen/delay.h>
+#include <xen/event.h>
+#include <xen/time.h>
+#include <xen/ac_timer.h>
+#include <xen/perfc.h>
+#include <xen/sched-if.h>
+#include <xen/slab.h>
+
+
+/* all per-domain BVT-specific scheduling info is stored here */
+struct fbvt_dom_info
+{
+    unsigned long mcu_advance;      /* inverse of weight: avt advance per MCU run (see fbvt_do_schedule) */
+    u32           avt;              /* actual virtual time */
+    u32           evt;              /* effective virtual time: avt, or avt - warp while warped (see __calc_evt) */
+    u32                      time_slept;           /* amount of time slept: grown in fbvt_wake_up, consumed in fbvt_do_schedule */
+    u32                      vtb;                  /* virtual time bonus: derived from time_slept in fbvt_do_schedule, capped at 1000 */
+    int           warpback;         /* non-zero while warping is requested: set on wake-up, cleared on block or unwarp */
+    long          warp;             /* virtual time warp: subtracted from avt to form evt while warped */
+    long          warpl;            /* warp limit: compared against (now - warped) in __calc_evt */
+    long          warpu;            /* unwarp time requirement: compared against (now - uwarped) in __calc_evt */
+    s_time_t      warped;           /* NOW() recorded by fbvt_wake_up when warping was last requested */
+    s_time_t      uwarped;          /* time at which __calc_evt last forced the domain to unwarp */
+};
+
+struct fbvt_cpu_info
+{
+    unsigned long svt; /* system virtual time: minimum avt found on this CPU's runqueue by fbvt_do_schedule. XXX check this is unsigned long! */
+};
+
+
+#define FBVT_INFO(p)   ((struct fbvt_dom_info *)(p)->sched_priv)
+#define CPU_INFO(cpu) ((struct fbvt_cpu_info *)(schedule_data[cpu]).sched_priv)
+#define CPU_SVT(cpu)  (CPU_INFO(cpu)->svt)
+
+/*
+ * Tunables. The cast expressions are fully parenthesised so that each
+ * macro expands to a single primary expression wherever it is used.
+ */
+#define MCU            ((s32)MICROSECS(100))  /* Minimum unit */
+#define MCU_ADVANCE    10                     /* default weight */
+#define TIME_SLOP      ((s32)MICROSECS(50))   /* allow time to slip a bit */
+static s32 ctx_allow = (s32)MILLISECS(5);     /* context switch allowance */
+
+/* SLAB cache for struct fbvt_dom_info objects */
+static kmem_cache_t *dom_info_cache;
+
+/*
+ * Calculate the effective virtual time for a domain. Take into account 
+ * warping limits
+ */
+static void __calc_evt(struct fbvt_dom_info *inf)
+{
+    s_time_t now = NOW();
+
+    /* No warp requested: effective time is simply the actual time. */
+    if ( !inf->warpback )
+    {
+        inf->evt = inf->avt;
+        return;
+    }
+
+    /* Still inside the warp limit and past the unwarp requirement? */
+    if ( ((now - inf->warped) < inf->warpl) &&
+         ((now - inf->uwarped) > inf->warpu) )
+    {
+        inf->evt = inf->avt - inf->warp;
+        return;
+    }
+
+    /* Otherwise drop the warp and remember when that happened. */
+    inf->evt      = inf->avt;
+    inf->uwarped  = now;
+    inf->warpback = 0;
+}
+
+/**
+ * fbvt_alloc_task - allocate FBVT private structures for a task
+ * @p:              task to allocate private structures for
+ *
+ * Returns non-zero on failure.
+ */
+int fbvt_alloc_task(struct domain *p)
+{
+    /* The pointer is stored even when the allocation fails (NULL). */
+    p->sched_priv = kmem_cache_alloc(dom_info_cache);
+
+    return ( p->sched_priv != NULL ) ? 0 : -1;
+}
+
+/*
+ * Add and remove a domain
+ */
+void fbvt_add_task(struct domain *p) 
+{
+    struct fbvt_dom_info *inf;
+
+    /* Check p before FBVT_INFO() dereferences it. */
+    ASSERT(p != NULL);
+    inf = FBVT_INFO(p);
+    ASSERT(inf != NULL);
+
+    /*
+     * Give every field a defined starting value for every domain,
+     * the idle domain included. fbvt_do_schedule reads time_slept and
+     * writes vtb for whichever domain it picks (possibly idle), and
+     * __calc_evt reads warped/uwarped, so none of them may be left
+     * holding whatever the allocator returned.
+     */
+    inf->mcu_advance = MCU_ADVANCE;
+    inf->vtb         = 0;
+    inf->time_slept  = 0;
+    inf->warpback    = 0;
+    inf->warp        = 0;
+    inf->warpl       = 0;
+    inf->warpu       = 0;
+    inf->warped      = 0;
+    inf->uwarped     = 0;
+
+    if ( p->domain == IDLE_DOMAIN_ID )
+    {
+        /* Largest possible virtual times: idle never beats a real domain. */
+        inf->avt = inf->evt = ~0U;
+    } 
+    else 
+    {
+        /* Set avt and evt to system virtual time. */
+        inf->avt         = CPU_SVT(p->processor);
+        inf->evt         = CPU_SVT(p->processor);
+    }
+
+    return;
+}
+
+/**
+ * fbvt_free_task - free FBVT private structures for a task
+ * @p:             task
+ */
+void fbvt_free_task(struct domain *p)
+{
+    /* Private data must have been set up by fbvt_alloc_task. */
+    ASSERT( p->sched_priv != NULL );
+
+    /* Hand the per-domain info back to the FBVT SLAB cache. */
+    kmem_cache_free( dom_info_cache, p->sched_priv );
+}
+
+
+/*
+ * Wake-up hook: bring a sleeping domain's virtual time forward, credit
+ * the time it slept, request warping and put it at the head of the
+ * runqueue.
+ */
+void fbvt_wake_up(struct domain *p)
+{
+    struct fbvt_dom_info *inf = FBVT_INFO(p);
+    s32 io_warp;
+
+    ASSERT(inf != NULL);
+
+    /* set the BVT parameters */
+    if (inf->avt < CPU_SVT(p->processor))
+    {
+        /*
+         * We want IO bound processes to gain dispatch precedence. It is
+         * especially for device driver domains. Therefore AVT should not
+         * be updated to SVT but to a value marginally smaller.
+         * Since frequently sleeping domains have high time_slept
+         * values, the virtual time can be determined as:
+         * SVT - const * TIME_SLEPT
+         *
+         * const is 1/2, capped at 10000. Integer division is used:
+         * hypervisor code should not do floating-point arithmetic, and
+         * for a u32 operand x / 2 equals the truncated value of 0.5 * x.
+         */
+        io_warp = (s32)(inf->time_slept / 2);
+        if ( io_warp > 10000 )
+            io_warp = 10000;
+
+        ASSERT(inf->time_slept + CPU_SVT(p->processor) > inf->avt + io_warp);
+        inf->time_slept += CPU_SVT(p->processor) - inf->avt - io_warp;
+        inf->avt = CPU_SVT(p->processor) - io_warp;
+    }
+
+    /* deal with warping here */
+    inf->warpback  = 1;
+    inf->warped    = NOW();
+    __calc_evt(inf);
+    __add_to_runqueue_head(p);
+}
+
+/* 
+ * Block the currently-executing domain until a pertinent event occurs.
+ */
+static void fbvt_do_block(struct domain *p)
+{
+    struct fbvt_dom_info *info = FBVT_INFO(p);
+
+    /* A blocking domain gives up any pending warp request. */
+    info->warpback = 0;
+}
+
+/* Control the scheduler. */
+int fbvt_ctl(struct sched_ctl_cmd *cmd)
+{
+    /* Any direction other than SCHED_INFO_PUT is treated as a read. */
+    if ( cmd->direction != SCHED_INFO_PUT )
+    {
+        cmd->u.fbvt.ctx_allow = ctx_allow;
+        return 0;
+    }
+
+    /* SCHED_INFO_PUT: take the caller's context switch allowance. */
+    ctx_allow = cmd->u.fbvt.ctx_allow;
+    return 0;
+}
+
+/* Adjust scheduling parameter for a given domain. */
+int fbvt_adjdom(struct domain *p,
+               struct sched_adjdom_cmd *cmd)
+{
+    struct fbvt_adjdom   *req = &cmd->u.fbvt;
+    struct fbvt_dom_info *info;
+    unsigned long         irq_flags;
+
+    if ( cmd->direction == SCHED_INFO_PUT )
+    {
+        /* Snapshot the requested values before touching the domain. */
+        unsigned long new_mcu_adv = req->mcu_adv;
+        unsigned long new_warp    = req->warp;
+        unsigned long new_warpl   = req->warpl;
+        unsigned long new_warpu   = req->warpu;
+
+        info = FBVT_INFO(p);
+
+        /* Log the values currently in force. */
+        DPRINTK("Get domain %u fbvt mcu_adv=%ld, warp=%ld, "
+                "warpl=%ld, warpu=%ld\n",
+                p->domain, info->mcu_advance, info->warp,
+                info->warpl, info->warpu );
+
+        /* Sanity -- this can avoid divide-by-zero. */
+        if ( new_mcu_adv == 0 )
+            return -EINVAL;
+
+        spin_lock_irqsave(&schedule_lock[p->processor], irq_flags);
+        info->mcu_advance = new_mcu_adv;
+        info->warp        = new_warp;
+        info->warpl       = new_warpl;
+        info->warpu       = new_warpu;
+
+        DPRINTK("Set domain %u fbvt mcu_adv=%ld, warp=%ld, "
+                "warpl=%ld, warpu=%ld\n",
+                p->domain, info->mcu_advance, info->warp,
+                info->warpl, info->warpu );
+
+        spin_unlock_irqrestore(&schedule_lock[p->processor], irq_flags);
+        return 0;
+    }
+
+    if ( cmd->direction == SCHED_INFO_GET )
+    {
+        info = FBVT_INFO(p);
+
+        /* Copy the current parameters out under the scheduler lock. */
+        spin_lock_irqsave(&schedule_lock[p->processor], irq_flags);
+        req->mcu_adv = info->mcu_advance;
+        req->warp    = info->warp;
+        req->warpl   = info->warpl;
+        req->warpu   = info->warpu;
+        spin_unlock_irqrestore(&schedule_lock[p->processor], irq_flags);
+    }
+
+    /* Unknown directions are ignored and reported as success. */
+    return 0;
+}
+
+
+/* 
+ * The main function
+ * - deschedule the current domain.
+ * - pick a new domain.
+ *   i.e., the domain with lowest EVT.
+ *   The runqueue should be ordered by EVT so that is easy.
+ */
+static task_slice_t fbvt_do_schedule(s_time_t now)
+{
+    struct domain *prev = current, *next = NULL, *next_prime, *p;
+    struct list_head   *tmp;
+    int                 cpu = prev->processor;
+    s32                 r_time;     /* time for new dom to run */
+    s32                 ranfor;     /* assume we never run longer than 2.1s! */
+    s32                 mcus;
+    u32                 next_evt, next_prime_evt, min_avt;
+    struct fbvt_dom_info *prev_inf       = FBVT_INFO(prev),
+                        *p_inf          = NULL,
+                        *next_inf       = NULL,
+                        *next_prime_inf = NULL;
+    task_slice_t        ret;
+
+    ASSERT(prev->sched_priv != NULL);
+    ASSERT(prev_inf != NULL);
+
+    if ( likely(!is_idle_task(prev)) ) 
+    {
+        ranfor = (s32)(now - prev->lastschd);
+        /* Calculate mcu and update avt. */
+        mcus = (ranfor + MCU - 1) / MCU;
+
+        if ( mcus * prev_inf->mcu_advance < prev_inf->vtb )
+        {
+            /*
+             * The whole run is covered by the virtual time bonus: avt
+             * stays put and the cost is taken out of time_slept.
+             */
+            ASSERT(prev_inf->time_slept >= mcus * prev_inf->mcu_advance);
+            prev_inf->time_slept -= mcus * prev_inf->mcu_advance;
+        }
+        else
+        {
+            /* Charge the run to avt, less the bonus; spend the bonus. */
+            prev_inf->avt += mcus * prev_inf->mcu_advance - prev_inf->vtb;
+
+            ASSERT(prev_inf->time_slept >= prev_inf->vtb);
+            prev_inf->time_slept -= prev_inf->vtb;
+        }
+        
+        __calc_evt(prev_inf);
+        
+        __del_from_runqueue(prev);
+        
+        if ( domain_runnable(prev) )
+            __add_to_runqueue_tail(prev);
+    }
+
+    /* We should at least have the idle task */
+    ASSERT(!list_empty(&schedule_data[cpu].runqueue));
+
+    /*
+     * scan through the run queue and pick the task with the lowest evt
+     * *and* the task the second lowest evt.
+     * this code is O(n) but we expect n to be small.
+     */
+    next       = schedule_data[cpu].idle;
+    next_prime = NULL;
+
+    next_evt       = ~0U;
+    next_prime_evt = ~0U;
+    min_avt        = ~0U;
+
+    list_for_each ( tmp, &schedule_data[cpu].runqueue )
+    {
+        p     = list_entry(tmp, struct domain, run_list);
+        p_inf = FBVT_INFO(p);
+
+        if ( p_inf->evt < next_evt )
+        {
+            next_prime     = next;
+            next_prime_evt = next_evt;
+            next = p;
+            next_evt = p_inf->evt;
+        } 
+        else if ( next_prime_evt == ~0U )
+        {
+            next_prime_evt = p_inf->evt;
+            next_prime     = p;
+        } 
+        else if ( p_inf->evt < next_prime_evt )
+        {
+            next_prime_evt = p_inf->evt;
+            next_prime     = p;
+        }
+
+        /* Determine system virtual time. */
+        if ( p_inf->avt < min_avt )
+            min_avt = p_inf->avt;
+    }
+
+    /* Update system virtual time. */
+    if ( min_avt != ~0U )
+        CPU_SVT(cpu) = min_avt;
+
+    /* check for virtual time overrun on this cpu */
+    if ( CPU_SVT(cpu) >= 0xf0000000 )
+    {
+        u_long t_flags; 
+        write_lock_irqsave(&tasklist_lock, t_flags); 
+        for_each_domain ( p )
+        {
+            if ( p->processor == cpu )
+            {
+                p_inf = FBVT_INFO(p);
+                p_inf->evt -= 0xe0000000;
+                p_inf->avt -= 0xe0000000;
+            }
+        } 
+        write_unlock_irqrestore(&tasklist_lock, t_flags); 
+        CPU_SVT(cpu) -= 0xe0000000;
+    }
+
+    next_inf = FBVT_INFO(next);
+    
+    /* check for time_slept overrun for the domain we schedule to run*/
+    if ( next_inf->time_slept >= 0xf0000000 )
+    {
+        /* time_slept is u32 and mcu_advance unsigned long: print unsigned. */
+        printk("Domain %d is assigned more CPU then it is able to use.\n"
+               "FBVT slept_time=%u, halving. Mcu_advance=%lu\n",next->domain, 
+               next_inf->time_slept, next_inf->mcu_advance);
+
+        next_inf->time_slept /= 2;
+    }
+
+    /*
+     * In here we decide on Virtual Time Bonus. The idea is, for the
+     * domains that have large time_slept values to be allowed to run
+     * for longer. Thus regaining the share of CPU originally allocated.
+     * This is accompanied by the warp mechanism (which moves IO-bound
+     * domains earlier in virtual time). Together this should give quite
+     * good control both for CPU and IO-bound domains.
+     *
+     * The bonus is one fifth of time_slept, capped at 1000. Integer
+     * division is used because hypervisor code should not do
+     * floating-point arithmetic.
+     */
+    next_inf->vtb = next_inf->time_slept / 5;
+    if ( next_inf->vtb > 1000 )
+        next_inf->vtb = 1000;
+
+    /* work out time for next run through scheduler */
+    if ( is_idle_task(next) ) 
+    {
+        r_time = ctx_allow;
+        goto sched_done;
+    }
+
+    if ( (next_prime == NULL) || is_idle_task(next_prime) )
+    {
+        /* We have only one runnable task besides the idle task. */
+        r_time = 10 * ctx_allow;     /* RN: random constant */
+        goto sched_done;
+    }
+
+    /*
+     * If we are here then we have two runnable tasks.
+     * Work out how long 'next' can run till its evt is greater than
+     * 'next_prime's evt. Take context switch allowance into account.
+     * next_prime's private data is fetched only here, after the NULL
+     * check above.
+     */
+    next_prime_inf = FBVT_INFO(next_prime);
+    ASSERT(next_prime_inf->evt >= next_inf->evt);
+   
+    r_time = ((next_prime_inf->evt + next_inf->vtb - next_inf->evt)/next_inf->mcu_advance)
+        + ctx_allow;
+
+    ASSERT(r_time >= ctx_allow);
+
+ sched_done:
+    next->min_slice = ctx_allow;
+    ret.task = next;
+    ret.time = r_time;
+    return ret;
+}
+
+
+/* Print the FBVT bookkeeping of one runqueue element (no newline). */
+static void fbvt_dump_runq_el(struct domain *p)
+{
+    const struct fbvt_dom_info *info = FBVT_INFO(p);
+
+    printk("mcua=0x%04lX ev=0x%08X av=0x%08X sl=0x%08X vtb=0x%08X ",
+           info->mcu_advance,
+           info->evt,
+           info->avt,
+           info->time_slept,
+           info->vtb);
+}
+
+/* Print the scheduler-wide settings: MCU and context switch allowance. */
+static void fbvt_dump_settings(void)
+{
+    u32 mcu_ns   = (u32)MCU;
+    s32 allow_ns = (s32)ctx_allow;
+
+    printk("FBVT: mcu=0x%08Xns ctx_allow=0x%08Xns ", mcu_ns, allow_ns );
+}
+
+/* Print the system virtual time of CPU i (no newline). */
+static void fbvt_dump_cpu_state(int i)
+{
+    unsigned long svt = CPU_SVT(i);
+
+    printk("svt=0x%08lX ", svt);
+}
+
+
+/* Initialise the data structures. */
+int fbvt_init_scheduler(void)
+{
+    int i;
+
+    /* One fbvt_cpu_info per possible CPU, starting at virtual time 0. */
+    for ( i = 0; i < NR_CPUS; i++ )
+    {
+        schedule_data[i].sched_priv = kmalloc(sizeof(struct fbvt_cpu_info));
+        if ( schedule_data[i].sched_priv == NULL )
+        {
+            printk("Failed to allocate FBVT scheduler per-CPU memory!\n");
+            return -1;
+        }
+
+        CPU_SVT(i) = 0; /* XXX do I really need to do this? */
+    }
+
+    /* Cache from which fbvt_alloc_task takes per-domain info. */
+    dom_info_cache = kmem_cache_create("FBVT dom info",
+                                       sizeof(struct fbvt_dom_info),
+                                       0, 0, NULL, NULL);
+
+    if ( dom_info_cache == NULL )
+    {
+        /* Newline added so the message does not run into the next one. */
+        printk("FBVT: Failed to allocate domain info SLAB cache\n");
+        return -1;
+    }
+
+    return 0;
+}
+
+/* Pause hook: take the domain off the runqueue if it is on it. */
+static void fbvt_pause(struct domain *p)
+{
+    if ( !__task_on_runqueue(p) )
+        return;
+
+    __del_from_runqueue(p);
+}
+
+/*
+ * Unpause hook: restart the domain's virtual time from the current
+ * system virtual time of its CPU and clear its virtual time bonus.
+ */
+static void fbvt_unpause(struct domain *p)
+{
+    struct fbvt_dom_info *info = FBVT_INFO(p);
+
+    if ( p->domain != IDLE_DOMAIN_ID )
+    {
+        /* Set avt to system virtual time. */
+        info->avt = CPU_SVT(p->processor);
+        /* Drop the bonus, then derive evt from the new avt. */
+        info->vtb = 0;
+        __calc_evt(info);
+        return;
+    }
+
+    /* Idle domain: pin both virtual times at the maximum. */
+    info->avt = info->evt = ~0U;
+}
+
+/* Hook table through which the FBVT scheduler is exposed. */
+struct scheduler sched_fbvt_def = {
+    .name           = "Fair Borrowed Virtual Time",
+    .opt_name       = "fbvt",
+    .sched_id       = SCHED_FBVT,
+
+    .init_scheduler = fbvt_init_scheduler,
+    .alloc_task     = fbvt_alloc_task,
+    .add_task       = fbvt_add_task,
+    .free_task      = fbvt_free_task,
+    .wake_up        = fbvt_wake_up,
+    .do_block       = fbvt_do_block,
+    .do_schedule    = fbvt_do_schedule,
+    .control        = fbvt_ctl,
+    .adjdom         = fbvt_adjdom,
+    .dump_settings  = fbvt_dump_settings,
+    .dump_cpu_state = fbvt_dump_cpu_state,
+    .dump_runq_el   = fbvt_dump_runq_el,
+    .pause          = fbvt_pause,
+    .unpause        = fbvt_unpause,
+};
+
index 5c2ca1579aeb9afe86cd7b061945cf45d3e2959e..ec4971f4d093989875f5dc6330a75f73f8a9abdf 100644 (file)
@@ -71,8 +71,9 @@ schedule_data_t schedule_data[NR_CPUS];
  * TODO: It would be nice if the schedulers array could get populated
  * automagically without having to hack the code in here.
  */
-extern struct scheduler sched_bvt_def, sched_rrobin_def, sched_atropos_def;
+extern struct scheduler sched_bvt_def, sched_fbvt_def, sched_rrobin_def, sched_atropos_def;
 static struct scheduler *schedulers[] = { &sched_bvt_def,
+                                         &sched_fbvt_def,
                                           &sched_rrobin_def,
                                           &sched_atropos_def,
                                           NULL};
@@ -225,6 +226,25 @@ void domain_wake(struct domain *d)
     spin_unlock_irqrestore(&schedule_lock[cpu], flags);
 }
 
+/*
+ * Pause a domain: domain_sleep() it, then invoke the scheduler's 'pause' op.
+ */
+void pause_domain(struct domain *domain)
+{
+       domain_sleep(domain);
+       SCHED_OP(pause, domain);        
+}
+
+
+/*
+ * Unpause a domain: invoke the scheduler's 'unpause' op, then domain_wake() it.
+ */
+void unpause_domain(struct domain *domain)
+{
+       SCHED_OP(unpause, domain);
+       domain_wake(domain);
+}
+
 /* Block the currently-executing domain until a pertinent event occurs. */
 long do_block(void)
 {
@@ -361,6 +381,7 @@ void __enter_scheduler(void)
     rem_ac_timer(&schedule_data[cpu].s_timer);
     
     ASSERT(!in_irq());
+if(!__task_on_runqueue(prev)) printk("Domain %d not on runqueue\n",prev->domain);
     ASSERT(__task_on_runqueue(prev));
 
     if ( test_bit(DF_BLOCKED, &prev->flags) )
index 34e1d3866eb3ab45ec66d6e23b442ccdfad25d80..17daa7aaea9682b596ade300049e6655a75e0d21 100644 (file)
@@ -9,8 +9,9 @@
 
 /* Scheduler types */
 #define SCHED_BVT      0
-#define SCHED_ATROPOS  1
-#define SCHED_RROBIN   2
+#define SCHED_FBVT     1
+#define SCHED_ATROPOS  2
+#define SCHED_RROBIN   3
 
 /* these describe the intended direction used for a scheduler control or domain
  * command */
@@ -32,6 +33,12 @@ struct sched_ctl_cmd
             u32 ctx_allow;            /*  8: context switch allowance */
         } PACKED bvt;
 
+        struct fbvt_ctl /* FBVT member of the scheduler control command. */
+        {
+            /* IN variables. */
+            u32 ctx_allow;            /*  8: context switch allowance */
+        } PACKED fbvt;
+
         struct rrobin_ctl
         {
             /* IN variables */
@@ -55,6 +62,14 @@ struct sched_adjdom_cmd
             u32 warpu;      /* 28: unwarp time requirement */
         } PACKED bvt;
 
+        struct fbvt_adjdom /* FBVT member of the per-domain adjust command. */
+        {
+            u32 mcu_adv;    /* 16: mcu advance: inverse of weight */
+            u32 warp;       /* 20: time warp */
+            u32 warpl;      /* 24: warp limit */
+            u32 warpu;      /* 28: unwarp time requirement */
+        } PACKED fbvt;
+
         struct atropos_adjdom
         {
             u64 nat_period; /* 16 */
index b8c3b419139f187deb46198f00eca021cc147c0b..c547d8f28adf07c7f08c895331882f8bcd3d17fb 100644 (file)
@@ -50,6 +50,7 @@ struct scheduler
     void         (*dump_runq_el)   (struct domain *);
     int          (*prn_state)      (int);
     void         (*pause)          (struct domain *);
+       void             (*unpause)                (struct domain *);
 };
 
 /* per CPU scheduler information */
index 009ce5f5e399647cbb4b99251fa9afd7b4614b61..372580147753a862856be15f3b89f48f0e102bb1 100644 (file)
@@ -218,6 +218,8 @@ int  sched_id();
 void init_idle_task(void);
 void domain_wake(struct domain *d);
 void domain_sleep(struct domain *d);
+void pause_domain(struct domain *d);
+void unpause_domain(struct domain *d);
 
 void __enter_scheduler(void);
 
@@ -268,14 +270,14 @@ static inline void domain_pause(struct domain *d)
 {
     ASSERT(d != current);
     atomic_inc(&d->pausecnt);
-    domain_sleep(d);
+    pause_domain(d);
 }
 
 static inline void domain_unpause(struct domain *d)
 {
     ASSERT(d != current);
     if ( atomic_dec_and_test(&d->pausecnt) )
-        domain_wake(d);
+        unpause_domain(d);
 }
 
 static inline void domain_unblock(struct domain *d)
@@ -288,13 +290,13 @@ static inline void domain_pause_by_systemcontroller(struct domain *d)
 {
     ASSERT(d != current);
     if ( !test_and_set_bit(DF_CTRLPAUSE, &d->flags) )
-        domain_sleep(d);
+        pause_domain(d);
 }
 
 static inline void domain_unpause_by_systemcontroller(struct domain *d)
 {
     if ( test_and_clear_bit(DF_CTRLPAUSE, &d->flags) )
-        domain_wake(d);
+        unpause_domain(d);
 }